/*
Input: KommW2020_Straßenverzeichnis.xlsx
	> electoral roll 2020: raw addresses of all eligible voters and their assigned precinct

Output: tmp/strassen_kow20_voll_buchstabe.dta

Main task:
	> import, clean, and identify street segments that are assigned to a given precinct 
			-> a segment is equal to a streetname + range of numbers (e.g., Mystreet 6-18)
		>> note: a maximum number is not always given (e.g., singletons and "from 6 onwards" cases) 
			-> we generate max numbers by hand (e.g., 6 -> 6-6 and "from 6" -> 6-320 (hypothetical max))
		
	> from street segments extend data to include each single address (e.g., Mystreet 6, Mystreet 8, etc.)
		-> several cases to be considered:
		0. streets without any numbers // not the case in 2017
		 . address named exactly (border cases of the segments: if 6-10a -> 6 and 10a)
		1. beginning and ending of address contain only numbers and no letters (e.g., 1-15)
		 If segments with letters, different cases:
			2. only beginning of the segment contains a letter (e.g., 16a-24)
				3. only ending of the segment contains a letter (e.g., 16-24f)
			If segments contain letter at the beginning and the ending, different cases:
			4. assignment given for exactly one address (same street number and same letter), e.g. 16a-16a
			5. assignment given for exactly one street number (but with different letters), e.g. 17b-17e
			6. first and second mentioned address contain letter and both have different numbers (e.g., 8b-11d)

*/



************************************************************************
**** 1. import, clean, and prepare for merging
************************************************************************
import excel using "$rawdata/election_office/electoral_rolls/KommW2020_Straßenverzeichnis.xlsx", clear

* unify the dash variant used in the spreadsheet to a plain "-" in the street
* column (A) and the house number segment column (B); drop the header row
	foreach col of varlist A B {
		replace `col' = subinstr(`col', "‐", "-", .)
	}
	drop if A=="Strasse"

* descriptive names and labels for the spreadsheet columns
	rename (A B C D) (street hnr_orig_segment stadtbez sb)
	label variable street "raw street from official source"
	label variable stadtbez "city district ID"
	label variable sb "raw precinct id"

* Spell out abbreviations and unify the spelling of street names for geocoding.
* The substitutions are case sensitive and are applied in exactly this order.
	replace street = subinstr(subinstr(street, "St.", "Sankt", .), "Südl.", "Südliche", .)
	* every "Alle" becomes "Allee"; names that already read "Allee" turn into
	* "Alleee" in the first step and are reset by the second one
	replace street = subinstr(subinstr(street, "Alle", "Allee", .), "Alleee", "Allee", .)
	replace street = subinstr(subinstr(street, "pl.", "platz", .), "Pl.", "Platz", .)
	replace street = subinstr(subinstr(street, "str.", "straße", .), "Str.", "Straße", .)
	replace street = subinstr(subinstr(street, "straße", "strasse", .), "Straße", "Strasse", .)

* undo the "Alle" substitution for two names in which "Alle" is only the
* beginning of a longer word
	replace street = "Allensteiner Strasse" if street == "Alleensteiner Strasse"
	replace street = "Allescherstrasse" if street == "Alleescherstrasse"

* merge_strasse := harmonized street name (the raw street variable is renamed)
	rename street merge_strasse
	label variable merge_strasse "harmonized street id for merging"

	* spell out academic titles
	replace merge_strasse = subinstr(subinstr(merge_strasse, "Dr.", "Doktor", .), "Prof.", "Professor", .)

	* remove blanks, then lowercase
	* NOTE(review): lower() is documented to change ASCII letters only, so a
	* capital umlaut would pass through the replacements below unchanged -
	* confirm this matches how the merge partner builds its key
	replace merge_strasse = lower(subinstr(merge_strasse, " ", "", .))

	* strip hyphens, dots and apostrophes
	replace merge_strasse = subinstr(subinstr(subinstr(merge_strasse, "-", "", .), ".", "", .), "'", "", .)

	* transliterate sharp s and lowercase umlauts
	replace merge_strasse = subinstr(subinstr(merge_strasse, "ß", "ss", .), "ä", "ae", .)
	replace merge_strasse = subinstr(subinstr(merge_strasse, "ö", "oe", .), "ü", "ue", .)
	
	
* remove blanks from the raw segment and cut it at the dash into its
* start (hnr_orig_segment1) and end (hnr_orig_segment2) component
* 	Note: singletons are given in the format 15-15
	replace hnr_orig_segment = subinstr(hnr_orig_segment, " ", "", .)
	split hnr_orig_segment, parse("-")
	
	* checks against typical typos in the raw file:
	* the end component does not start with a zero
	assert substr(hnr_orig_segment2, 1, 1) != "0"
	* the start component contains no comma
	assert !strpos(hnr_orig_segment1, ",")
	* there is at most one dash, i.e. split created no third component
	capture confirm variable hnr_orig_segment3
	assert _rc != 0
	* start and end are either both given or both empty (no number at all)
	assert missing(hnr_orig_segment1) == missing(hnr_orig_segment2)
	
	
* split both components (1 = start/min, 2 = end/max) into
*	hnr_*        : number given by the first run of digits
*	buchstaben_* : what is left after removing that number (= address letters)
	forvalues k = 1/2 {
		local bound : word `k' of min max
		gen hnr_`bound' = real(ustrregexs(1)) if ustrregexm(hnr_orig_segment`k', "([0-9]+)")
		gen buchstaben_`bound' = subinstr(hnr_orig_segment`k', string(hnr_`bound'), "", .)
	}

* parity of the segment
*	even-even (e.g., 2-16) and odd-odd (e.g., 1-17) refer to opposite sides of a street
*	Note: mixed segments (even-odd, odd-even) do not occur, which is asserted below
*	Note: single addresses are given as 15-15, so both bounds share the parity
	gen even = mod(hnr_min, 2) == 0 & mod(hnr_max, 2) == 0
	gen odd = mod(hnr_min, 2) == 1 & mod(hnr_max, 2) == 1
	assert even + odd == 1 if !missing(hnr_orig_segment)

* streets without any number get missing parity flags instead of 0
	foreach flag of varlist even odd {
		replace `flag' = . if missing(hnr_orig_segment)
	}



************************************************************************
**** 2. start generating addresses
************************************************************************
//go through different cases
frame copy default tmp, replace
frame tmp {
* case0: streets listed without any house number
*        -> generate every number/letter combination for these streets
	keep if missing(hnr_orig_segment)
	keep stadtbez sb merge_strasse

* one row per house number 1..321
* (321 is chosen as no street contains a higher number (checked));
* the assert makes sure each street occurs only once per district
	expand 321
	bysort stadtbez merge_strasse: assert _N == 321
	bysort stadtbez merge_strasse: gen hnr = _n
	
* 27 rows per number: copies 1-26 get the letters a-z (char(97) is "a"),
* copy 27 keeps an empty letter, i.e. the plain number
	expand 27
	bysort stadtbez merge_strasse hnr: gen buchstabe = cond(_n <= 26, char(96 + _n), "")
	
* save tempfile (appended below)
	tempfile full_buchstabe 
	save `full_buchstabe'
}


* streets without numbers are done; all remaining rows must have both bounds
	drop if missing(hnr_orig_segment)
	assert !missing(hnr_min, hnr_max)

	
	
frame copy default tmp, replace
frame tmp {
*case.x: addresses named exactly, i.e. the two borders of every segment
*        (6-10a -> 6 and 10a)

* a segment must not be listed twice for the same precinct
	bysort stadtbez sb merge_strasse hnr_orig_segment: assert _N == 1

* two rows per segment: the original row carries the lower border,
* the copy (is_upper==1) carries the upper border
	expand 2, generate(is_upper)
	gen hnr = cond(is_upper, hnr_max, hnr_min)
	gen buchstabe = cond(is_upper, buchstaben_max, buchstaben_min)

* keep relevant variables
	keep merge_strasse hnr buchstabe sb stadtbez

* both borders coincide for singletons (15-15): keep one copy
	duplicates drop

* save tempfile (merged and appended below)
	tempfile exact 
	save `exact'
}


* sanity check: the lower bound never exceeds the upper bound
* (hnr_min and hnr_max are already numeric at this point)
	assert hnr_min<=hnr_max

// split segments into the corresponding numbers (what is still missing
// afterwards are potential letters belonging to the addresses)

* seg_id: one id per segment row. The copies created below are numbered within
* this id and not within stadtbez/merge_strasse/hnr_orig_segment: if the same
* segment is listed for two precincts, numbering within the street key would
* treat both rows as one group, run past hnr_max and mix up the precincts.
	gen long seg_id = _n

* tmp1: count of distinct street numbers between hnr_min and hnr_max (inclusive)
	gen tmp1 = hnr_max - hnr_min + 1

* expand to that many rows per segment
	expand tmp1
	drop tmp1

* hnr: the address number, counting up from hnr_min within each segment
	bys seg_id: gen hnr = hnr_min + _n - 1
	drop seg_id

* segments contain only even or only odd numbers; generated numbers of the
* other parity are removed
	drop if odd==1 & mod(hnr, 2) == 0
	drop if even==1 & mod(hnr, 2) == 1

* the raw segment string is no longer needed
	drop hnr_orig_segment
	
	
	
	
	
	
frame copy default tmp, replace
frame tmp {
*case1: neither the beginning nor the ending of the segment carries a letter
	keep if missing(buchstaben_min) & missing(buchstaben_max)

* 27 rows per address number: copies 1-26 get the letters a-z (char(97) is "a"),
* copy 27 keeps an empty letter, i.e. the plain number.
* The assert checks that no number was generated twice for the same segment.
	expand 27
	bysort stadtbez merge_strasse hnr hnr_orig_segment1 hnr_orig_segment2: assert _N == 27
	bysort stadtbez merge_strasse hnr hnr_orig_segment1 hnr_orig_segment2: gen buchstabe = cond(_n <= 26, char(96 + _n), "")

* remove helper variables
	drop even odd hnr_min hnr_max  buchstaben_min buchstaben_max hnr_orig_segment1 hnr_orig_segment2 

* save tempfile (appended below)
	tempfile no_buchstabe 
	save `no_buchstabe'
}


* case1 is done: remove these segments from the working data
	drop if missing(buchstaben_min) & missing(buchstaben_max)

frame copy default tmp, replace
frame tmp {
*case 2: only the beginning of the segment carries a letter (e.g., 16a-24)
	keep if !missing(buchstaben_min) & missing(buchstaben_max)
	
* tmp_buch: position of the start letter in the alphabet (a=1, ..., z=26);
* anything that is not a single letter a-z stays missing
	gen byte tmp_buch = strpos("abcdefghijklmnopqrstuvwxyz", buchstaben_min) ///
		if strlen(buchstaben_min) == 1 & inrange(buchstaben_min, "a", "z")

* 27 rows per address number, letter_no = 0 (no letter), 1 (a), ..., 26 (z);
* at the first number of the segment everything before the start letter is removed
	expand 27
	bysort stadtbez merge_strasse hnr hnr_orig_segment1: assert _N == 27
	bysort stadtbez merge_strasse hnr hnr_orig_segment1: gen letter_no = _n - 1
	drop if letter_no < tmp_buch & hnr == hnr_min

* translate the letter position back into the letter (char(97) is "a")
	gen buchstabe = cond(letter_no == 0, "", char(96 + letter_no))
	
* remove helper variables
	drop even odd hnr_min hnr_max  buchstaben_min buchstaben_max hnr_orig_segment1 hnr_orig_segment2 tmp_buch letter_no
	
* save tempfile (appended below)
	tempfile buchstabe_min 
	save `buchstabe_min'
}


frame copy default tmp, replace
frame tmp {
*case 3: only the ending of the segment carries a letter (e.g., 16-24f)
	keep if missing(buchstaben_min) & !missing(buchstaben_max)
	
* tmp_buch: position of the end letter in the alphabet (a=1, ..., z=26);
* anything that is not a single letter a-z stays missing
	gen byte tmp_buch = strpos("abcdefghijklmnopqrstuvwxyz", buchstaben_max) ///
		if strlen(buchstaben_max) == 1 & inrange(buchstaben_max, "a", "z")
	
* 27 rows per address number, letter_no = 0 (no letter), 1 (a), ..., 26 (z);
* at the last number of the segment everything after the end letter is removed
	expand 27
	bysort stadtbez merge_strasse hnr hnr_orig_segment2: assert _N == 27
	bysort stadtbez merge_strasse hnr hnr_orig_segment2: gen letter_no = _n - 1
	drop if letter_no > tmp_buch & hnr == hnr_max
	
* translate the letter position back into the letter (char(97) is "a")
	gen buchstabe = cond(letter_no == 0, "", char(96 + letter_no))
	
* remove helper variables
	drop even odd hnr_min hnr_max  buchstaben_min buchstaben_max hnr_orig_segment1 hnr_orig_segment2 tmp_buch letter_no
	
* save tempfile (appended below)
	tempfile buchstabe_max 
	save `buchstabe_max'
}


* from here on only segments with a letter at both ends are left to handle
	keep if !missing(buchstaben_min) & !missing(buchstaben_max)
	

frame copy default tmp, replace
frame tmp {
*case4: assignment given for exactly one address
*       (same number and same letter at both ends, e.g., 16a-16a)
	drop if hnr_min != hnr_max | buchstaben_min != buchstaben_max

* the single address carries the letter given in the segment
	gen buchstabe = buchstaben_min

* remove helper variables
	drop even odd hnr_min hnr_max  buchstaben_min buchstaben_max hnr_orig_segment1 hnr_orig_segment2 
	
* save tempfile (appended below)
	tempfile one_address 
	save `one_address'
}


frame copy default tmp, replace
frame tmp {
*case5: assignment given for exactly one street number but a range of letters
*       (e.g., 17b-17e)
	keep if hnr_min == hnr_max & buchstaben_min != buchstaben_max

* pos_min / pos_max: position of the first and of the last letter in the
* alphabet (a=1, ..., z=26); anything that is not a single letter a-z stays missing
	gen byte pos_min = strpos("abcdefghijklmnopqrstuvwxyz", buchstaben_min) ///
		if strlen(buchstaben_min) == 1 & inrange(buchstaben_min, "a", "z")
	gen byte pos_max = strpos("abcdefghijklmnopqrstuvwxyz", buchstaben_max) ///
		if strlen(buchstaben_max) == 1 & inrange(buchstaben_max, "a", "z")

* 26 rows per address number (letter_no = 1 (a), ..., 26 (z)); letters outside
* the range given in the segment are removed
	expand 26
	bysort stadtbez merge_strasse hnr: assert _N == 26
	bysort stadtbez merge_strasse hnr: gen letter_no = _n
	drop if letter_no < pos_min | letter_no > pos_max
	
* translate the letter position back into the letter (char(97) is "a")
	gen buchstabe = char(96 + letter_no)

* remove helper variables
	drop even odd hnr_min hnr_max  buchstaben_min buchstaben_max hnr_orig_segment1 hnr_orig_segment2 pos_min pos_max letter_no
	
* save tempfile (appended below)
	tempfile one_number 
	save `one_number'
}

* cases 4 and 5 are done: remove all segments that cover a single number
	drop if hnr_min == hnr_max


*last remaining case (case6): both ends of the segment carry a letter and the
*two numbers differ (e.g., 8b-11d); handled directly in the working data

* pos_min / pos_max: position of the first and of the last letter in the
* alphabet (a=1, ..., z=26); anything that is not a single letter a-z stays missing
	gen byte pos_min = strpos("abcdefghijklmnopqrstuvwxyz", buchstaben_min) ///
		if strlen(buchstaben_min) == 1 & inrange(buchstaben_min, "a", "z")
	gen byte pos_max = strpos("abcdefghijklmnopqrstuvwxyz", buchstaben_max) ///
		if strlen(buchstaben_max) == 1 & inrange(buchstaben_max, "a", "z")
	
* 27 rows per address number, letter_no = 0 (no letter), 1 (a), ..., 26 (z);
* at the first number everything before the start letter is removed,
* at the last number everything after the end letter
	expand 27
	bysort stadtbez merge_strasse hnr hnr_orig_segment1 hnr_orig_segment2: assert _N == 27
	bysort stadtbez merge_strasse hnr hnr_orig_segment1 hnr_orig_segment2: gen letter_no = _n - 1
	drop if (letter_no < pos_min & hnr_min == hnr) | (letter_no > pos_max & hnr_max == hnr)
	
* translate the letter position back into the letter (char(97) is "a")
	gen buchstabe = cond(letter_no == 0, "", char(96 + letter_no))



* remove helper variables
	drop even odd hnr_min hnr_max  buchstaben_min buchstaben_max hnr_orig_segment1 hnr_orig_segment2 pos_min pos_max letter_no

   
* Stack the address lists saved for cases 0-5 under the case-6 addresses that
* are still in memory
	append using `full_buchstabe'
	append using `no_buchstabe'
	append using `buchstabe_min'
	append using `buchstabe_max'
	append using `one_address'
	append using `one_number'

* Merge the exact cases (e.g., 6a and 8b in 6a-8b) and drop the merged addresses;
* they might occur a second time with a different precinct due to partly imprecise electoral rolls.
* assert(1 3): every exact address must also exist among the generated addresses;
* keep(1): only the addresses that did not match an exact one are kept.
* NOTE(review): sb is part of the merge key, so only copies with the SAME precinct
* as the exact entry are removed here. A copy listed under a different precinct
* stays in the data and is later handled by rule 1) of the duplicate handling
* below - confirm that this is intended.
	merge m:1 stadtbez sb merge_strasse hnr buchstabe using `exact', assert(1 3) keep(1) nogen
* Append the exact cases again, i.e. the border addresses as named in the raw file
	append using `exact'	

* Generate the address number as text: number followed by the letter (if any)
	tostring hnr, replace 
	gen nummer = hnr + buchstabe
	lab var nummer "address number"

	* Precinct id: take the last two characters of the raw precinct id, prefix
	* them with the district id, then convert both ids to numeric
	replace sb = substr(sb, strlen(sb)-1, 2)
	replace sb = stadtbez + sb
	destring stadtbez sb, replace
	*tostring stadtbez sb, replace

* Handle duplicate addresses:
	* 1) duplicates within district across precincts: drop BOTH b/c definitive assignment to precinct not possible
	* 2) duplicates within precinct (sb): keep one copy
	* Rule 2) is applied first, so that any address still occurring more than
	* once within a district afterwards belongs to different precincts (rule 1).
	duplicates drop stadtbez merge_strasse nummer sb, force
	bys stadtbez merge_strasse nummer: keep if _N==1

* keep only necessary variables
	keep merge_strasse nummer sb stadtbez

* save the address-level file (one row per district, street and address number)
	save "$tmp/strassen_kow20_voll_buchstabe.dta", replace
